1 Data Selection

Important: this dataset has ONLY been filtered for ‘red’ criteria for technical quality.

# Build the modelling dataset from the screened AOC data:
# keep only studies passing both quality tiers, harmonise the effect metric,
# log10-transform strictly-positive numeric columns, and encode the response.
df <- aoc_z %>% 
  filter(tech.tier.zero == "N") %>%  # passes technical-quality criteria
  filter(risk.tier.zero == "N") %>%  # applicable for risk assessment. VERY RESTRICTIVE
  dplyr::select(c(organism.group, exposure.duration.d, lvl1_f, dose.um3.mL.master,
                  dose.mg.L.master, dose.particles.mL.master, life.stage, bio.org,
                  polymer, shape, size.length.um.used.for.conversions, treatments,
                  effect, effect.metric, particle.volume.um3, density.mg.um.3,
                  environment, exposure.route, lvl2_f)) %>% 
  # Harmonise metric labels: NONEC/HONEC are synonyms of NOEC; metrics not
  # listed fall through to NA and are then labelled "not_available".
  mutate(effect.metric = as.character(effect.metric)) %>% 
  mutate(effect.metric = case_when(
    effect.metric %in% c("NONEC", "HONEC") ~ "NOEC",
    effect.metric %in% c("LOEC", "LC50", "EC50", "EC10") ~ effect.metric
  )) %>% 
  mutate(effect.metric = replace_na(effect.metric, "not_available")) %>% 
  mutate(effect.metric = as.factor(effect.metric)) %>% 
  drop_na() %>%  # drop rows with any remaining missing values
  # BUG FIX: the original predicate `~is.numeric(.) && (.) > 0` applied the
  # scalar operator `&&` to a whole column, so only the FIRST value was tested
  # (and it errors outright on R >= 4.3). Intended behaviour: log10-transform
  # numeric columns whose values are strictly positive throughout.
  mutate_if(~ is.numeric(.) && all(. > 0), log10) %>% 
  mutate(effect_10 = case_when(  # convert ordinal Y/N to numeric 0/1
      effect == "Y" ~ 1,
      effect == "N" ~ 0
    )) %>%
  mutate(effect_10 = factor(effect_10)) %>% 
  dplyr::select(-(effect))

#ensure completeness: skimr summary should show complete_rate == 1 for every column
skim(df)
Data summary
Name df
Number of rows 2188
Number of columns 19
_______________________
Column type frequency:
factor 11
numeric 8
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
organism.group 0 1 FALSE 10 Cru: 795, Fis: 528, Mol: 433, Alg: 222
lvl1_f 0 1 FALSE 7 Fit: 1522, Met: 485, Beh: 95, Str: 44
life.stage 0 1 FALSE 4 Ear: 974, Adu: 759, Not: 253, Juv: 202
bio.org 0 1 FALSE 5 org: 1163, sub: 668, cel: 251, tis: 59
polymer 0 1 FALSE 7 PS: 1262, PE: 588, PET: 129, PP: 106
shape 0 1 FALSE 3 sph: 1463, fra: 649, fib: 76, Not: 0
effect.metric 0 1 FALSE 3 not: 1576, NOE: 318, LOE: 294, EC1: 0
environment 0 1 FALSE 3 Mar: 1384, Fre: 770, Ter: 34
exposure.route 0 1 FALSE 3 wat: 2145, med: 34, sed: 9, cop: 0
lvl2_f 0 1 FALSE 25 Mor: 412, Rep: 368, Oxi: 288, Dev: 287
effect_10 0 1 FALSE 2 0: 1504, 1: 684

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
exposure.duration.d 0 1 0.93 0.73 -1.40 0.48 1.00 1.32 1.98 ▁▁▆▆▇
dose.um3.mL.master 0 1 5.84 2.34 -5.67 4.42 6.02 7.37 14.32 ▁▁▇▆▁
dose.mg.L.master 0 1 -0.23 2.20 -11.64 -1.70 -0.30 1.40 8.17 ▁▁▇▆▁
dose.particles.mL.master 0 1 4.24 4.05 -4.20 1.05 3.40 7.20 21.36 ▃▇▃▁▁
size.length.um.used.for.conversions 0 1 0.71 1.32 -4.00 -0.30 0.78 1.45 3.70 ▁▃▅▇▃
treatments 0 1 0.58 0.15 0.00 0.48 0.60 0.60 1.00 ▁▁▆▇▁
particle.volume.um3 0 1 1.60 3.76 -12.28 -1.18 2.05 3.73 10.21 ▁▃▆▇▃
density.mg.um.3 0 1 -8.98 0.04 -9.06 -9.00 -8.97 -8.97 -8.85 ▃▂▇▁▁
# Train/test partition ----
# rsample::initial_split() with prop = 0.75 gives 75% training / 25% testing;
# the seed makes the partition reproducible.
set.seed(4)
df_split <- initial_split(df, prop = 0.75)

# Materialise the two partitions from the split object.
train <- training(df_split)
test <- testing(df_split)

# Names of the response and of every feature column.
y <- "effect_10"
x <- setdiff(names(df), y)

2 Modelling

2.1 Model Performance

2.1.1 Histogram of Residuals

# Residual histograms, two model-performance objects per panel.
plot(mp_classif_rf, mp_classif_glm, geom ="histogram")

plot(mp_classif_svm, mp_classif_decTree, geom ="histogram")

plot(mp_classif_nnet, mp_classif_xgbTree, geom ="histogram")

### Precision Recall Curves

# Precision-recall curves for all six classifiers on one panel.
plot(mp_classif_rf, mp_classif_glm, mp_classif_svm, mp_classif_decTree, mp_classif_nnet, mp_classif_xgbTree, geom ="prc")

2.1.2 Reverse Cumulative Distribution of Residuals

# Compare residuals: reverse cumulative distribution of residuals per model.
resid_dist <- plot(mp_classif_rf, mp_classif_glm, mp_classif_svm, mp_classif_decTree, mp_classif_nnet,
                   mp_classif_xgbTree) + #, mp_classif_logicBag) +
  theme_minimal() +
        theme(legend.position = 'bottom',
              plot.title = element_text(hjust = 0.5)) + 
        labs(y = '')
resid_dist

#### Residuals

# Boxplots of residuals per model, same theming as the distribution plot.
resid_box <- plot(mp_classif_rf, mp_classif_glm, mp_classif_svm, mp_classif_decTree, mp_classif_nnet, mp_classif_xgbTree,
     #mp_classif_logicBag, 
     geom = "boxplot") +
  theme_minimal() +
        theme(legend.position = 'bottom',
              plot.title = element_text(hjust = 0.5)) 
resid_box

# FIX: library() fails loudly if gridExtra is missing; require() only returns
# FALSE and would let grid.arrange() fail with a confusing error later.
library(gridExtra)
grid.arrange(resid_box, resid_dist, ncol = 2)

2.1.3 Lift Curves

Lift curves describe a performance coefficient (lift) over the cumulative proportion of a population. Lift is calculated as the ratio of “yes’s” on a certain sample point (for toxicity) divided by the ratio of “yes’s” on the whole dataset. \(Lift = Predicted Rate/Average Rate\).

2.1.3.0.1 ALT METHOD

http://rstudio-pubs-static.s3.amazonaws.com/436131_3212dcf341cc422590f1a9f52830cfd6.html

# Transform datasets and model objects into scored data and calculate ntiles
# (percentiles here, since ntiles = 100).
# BUG FIX: the model labels were previously broken across source lines, which
# embedded literal newlines and runs of indentation spaces INSIDE the label
# strings (e.g. "Decision\n  tree"); each label is now a single-line string.
scores_and_ntiles <- prepare_scores_and_ntiles(datasets = list("train", "test"),
                                               dataset_labels = list("train data", "test data"),
                                               models = list("classif_rf", "classif_decTree",
                                                             "classif_glm", "classif_nnet",
                                                             "classif_svm", "classif_xgbTree"),
                                               model_labels = list("random forest",
                                                                   "Decision tree",
                                                                   "General linear model",
                                                                   "Neural net",
                                                                   "Support Vector machine",
                                                                   "eXtreme Gradient Boosting Trees"),
                                               target_column = "effect_10",
                                               ntiles = 100)


# transform data generated with prepare_scores_and_ntiles into aggregated data for chosen plotting scope 
plot_input <- plotting_scope(prepared_input = scores_and_ntiles, scope = 'compare_models')
# NOTE(review): RColorBrewer::brewer.pal() warns and returns 3 colours when
# n = 2 is requested (the minimum palette size is 3); with six models being
# compared, colours will be recycled — confirm this is the intended look.
plot_cumgains(data = plot_input, custom_line_colors = RColorBrewer::brewer.pal(2,'Accent'))

# Cumulative lift: ratio of the response rate in the top ntiles to the average rate.
plot_cumlift(data = plot_input,custom_line_colors = RColorBrewer::brewer.pal(2,'Accent'))

# Cumulative response, with the 20th ntile highlighted.
plot_cumresponse(data = plot_input,highlight_ntile = 20, 
                 custom_line_colors = RColorBrewer::brewer.pal(2,'Accent'))

# All four evaluation plots on one multiplot.
plot_multiplot(data = plot_input,  custom_line_colors = RColorBrewer::brewer.pal(2,'Accent'))

2.1.4 Roc Curves

# ROC curves for the three models kept in view (others commented out).
# The subtitle concatenates each model's AUC from its performance object.
plot(mp_classif_rf, 
     #mp_classif_glm, 
     mp_classif_svm, 
     #mp_classif_decTree, 
     mp_classif_nnet,
     #mp_classif_xgbTree,
     geom = "roc") +
  ggtitle("ROC Curves - All Models",
          # paste() joins every piece with a single space, so the nested
          # paste() calls in the original produced the identical string;
          # flattened here for readability.
          paste("AUC_rf = ", round(mp_classif_rf$measures$auc, 3),
                "AUC_svm = ", round(mp_classif_svm$measures$auc, 3),
                "AUC_nnet = ", round(mp_classif_nnet$measures$auc, 3))) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

2.1.5 Variable Importance

# Permutation-based variable importance for each explainer.
# FIX: loss_root_mean_square() is a regression loss; these explainers wrap
# binary classifiers, so use DALEX's classification loss (1 - AUC) so the
# drop-out loss axis is meaningful for class-probability models.
vi_classif_rf <- model_parts(explainer_classif_rf, loss_function = loss_one_minus_auc)
vi_classif_glm <- model_parts(explainer_classif_glm, loss_function = loss_one_minus_auc)
vi_classif_svm <- model_parts(explainer_classif_svm, loss_function = loss_one_minus_auc)
vi_classif_decTree <- model_parts(explainer_classif_decTree, loss_function = loss_one_minus_auc)
vi_classif_nnet <- model_parts(explainer_classif_nnet, loss_function = loss_one_minus_auc)
vi_classif_xgbTree <- model_parts(explainer_classif_xgbTree, loss_function = loss_one_minus_auc)
#vi_classif_logicBag <- model_parts(explainer_classif_logicBag, loss_function = loss_one_minus_auc)

# Importance panels, three models per figure.
plot(vi_classif_rf, vi_classif_glm, vi_classif_svm)#, #vi_classif_logicBag)

plot(vi_classif_decTree, vi_classif_nnet, vi_classif_xgbTree)

2.1.6 Partial Dependence Plot

# Partial-dependence profiles for volumetric dose, one per model.
# FIX: model_profile()'s argument is `variables` (plural); the original
# `variable =` only worked through R's partial argument matching.
pdp_classif_rf <- model_profile(explainer_classif_rf, variables = "dose.um3.mL.master", type = "partial")
pdp_classif_glm <- model_profile(explainer_classif_glm, variables = "dose.um3.mL.master", type = "partial")
pdp_classif_svm <- model_profile(explainer_classif_svm, variables = "dose.um3.mL.master", type = "partial")
pdp_classif_decTree <- model_profile(explainer_classif_decTree, variables = "dose.um3.mL.master", type = "partial")
pdp_classif_nnet <- model_profile(explainer_classif_nnet, variables = "dose.um3.mL.master", type = "partial")
pdp_classif_xgbTree <- model_profile(explainer_classif_xgbTree, variables = "dose.um3.mL.master", type = "partial")
#pdp_classif_logicBag <- model_profile(explainer_classif_logicBag, variables = "dose.um3.mL.master", type = "partial")

# All six PDPs overlaid on one figure.
plot(pdp_classif_rf, pdp_classif_glm, pdp_classif_svm, pdp_classif_decTree, pdp_classif_nnet, pdp_classif_xgbTree)#, pdp_classif_logicBag)

# Partial dependence on organism type (overwrites the dose PDP objects above).
# FIX: model_profile()'s argument is `variables` (plural); `variable =` only
# worked through R's partial argument matching.
pdp_classif_rf <- model_profile(explainer_classif_rf, variables = "organism.group", type = "partial")
pdp_classif_glm <- model_profile(explainer_classif_glm, variables = "organism.group", type = "partial")
pdp_classif_svm <- model_profile(explainer_classif_svm, variables = "organism.group", type = "partial")
pdp_classif_decTree <- model_profile(explainer_classif_decTree, variables = "organism.group", type = "partial")
pdp_classif_nnet <- model_profile(explainer_classif_nnet, variables = "organism.group", type = "partial")
pdp_classif_xgbTree <- model_profile(explainer_classif_xgbTree, variables = "organism.group", type = "partial")
#pdp_classif_logicBag <- model_profile(explainer_classif_logicBag, variables = "organism.group", type = "partial")


# Contrastive profiles: plot the aggregated profiles from all six models together.
plot(pdp_classif_rf$agr_profiles, pdp_classif_glm$agr_profiles, pdp_classif_svm$agr_profiles, pdp_classif_decTree$agr_profiles, pdp_classif_nnet$agr_profiles, pdp_classif_xgbTree$agr_profiles) +
  ggtitle("Contrastive Partial Dependence Profiles", "") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

2.1.6.0.1 Grouped Partial Dependence Plots
#plot grouped partial dependence, two models per figure
# The six per-model panels below differed only in the profile object and the
# title, so they are built with one local helper. This also fixes title typos
# in the original ("Forest:Partial" and "Machine:Partial" missing a space,
# "Boosing" -> "Boosting").
pdp_panel <- function(pdp, model_name) {
  # Standardised contrastive-PDP panel for a single model's aggregated profiles.
  plot(pdp$agr_profiles) +
    ggtitle(paste0(model_name, ": Partial Dependence Profiles by Dose and Effect Metric"), "") +
    xlab("log10(dose particles/mL)") +
    ylab("Average Prediction for Effect (1 = yes, 0 = no)") +
    scale_color_discrete(name = "Model x Effect Metric") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
}

#plot neural net and glm
# NOTE: `glm` shadows stats::glm for the rest of the session; variable names
# are kept as in the original script for backward compatibility.
glm <- pdp_panel(pdp_classif_glm, "GLM")
nnet <- pdp_panel(pdp_classif_nnet, "Neural Net")
grid.arrange(glm, nnet)

#plot rf and decision tree
rf <- pdp_panel(pdp_classif_rf, "Random Forest")
decTree <- pdp_panel(pdp_classif_decTree, "Decision Tree")
grid.arrange(rf, decTree)

#plot svm and xgboost
svm <- pdp_panel(pdp_classif_svm, "Support Vector Machine")
xgbTree <- pdp_panel(pdp_classif_xgbTree, "eXtreme Gradient Boosting Trees")
grid.arrange(svm, xgbTree)

2.1.7 Confusion Matrices

Confusion Matrices for the six tested models are below.

# fill alpha relative to sensitivity/specificity by proportional outcomes within reference groups (see dplyr code above as well as original confusion matrix for comparison)
# The six confusion-matrix heatmaps shared an identical structure; one helper
# builds each panel from its proportion table, a title, and the model's
# performance object.
plot_confusion <- function(plot_table, title, mp) {
  # `100 * round(x, 3)` (round THEN scale) matches the original subtitle
  # formatting, giving at most one decimal place of accuracy percentage.
  ggplot(data = plot_table, mapping = aes(x = Reference, y = Prediction, fill = prop)) +
    geom_tile() +
    geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
    scale_fill_gradient(low = "white", high = "cyan4", name = "Proportion") +
    scale_x_discrete(labels = c("No Effect", "Effect")) +
    scale_y_discrete(labels = c("No Effect", "Effect")) +
    ggtitle(title,
            paste("Accuracy = ", 100 * round(mp$measures$accuracy, 3), "%")) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5),
          legend.position = "none")
}

CM_rf <- plot_confusion(plotTable_rf, "Random Forest", mp_classif_rf)
CM_glm <- plot_confusion(plotTable_glm, "General Linear Model", mp_classif_glm)
CM_svm <- plot_confusion(plotTable_svm, "Support Vector Machine", mp_classif_svm)
CM_decTree <- plot_confusion(plotTable_decTree, "Decision Tree", mp_classif_decTree)
CM_nnet <- plot_confusion(plotTable_nnet, "Neural Net", mp_classif_nnet)
CM_xgbTree <- plot_confusion(plotTable_xgbTree, "eXtreme Gradient Boosting Trees", mp_classif_xgbTree)

grid.arrange(CM_rf, CM_glm, CM_svm, CM_decTree, CM_nnet, CM_xgbTree,
             ncol = 2, top = "Confusion Matrices for ML Models")

2.1.8 Breakdown

# create a single observation
# NOTE(review): despite the comment, this takes the WHOLE effect_10 column,
# and `new_cust` is never used below — predict_parts() is called on the full
# `test` frame instead. DALEX's predict_parts() expects a single observation;
# confirm whether something like `test[1, ]` was intended here.
new_cust <- test$effect_10 %>% as.data.frame()


# compute breakdown distances (additive break-down of a prediction per explainer)
new_cust_glm <- predict_parts(explainer_classif_glm, new_observation = test, type = "break_down")
new_cust_rf <- predict_parts(explainer_classif_rf, new_observation = test, type = "break_down")
new_cust_svm <- predict_parts(explainer_classif_svm, new_observation = test, type = "break_down")
new_cust_decTree <- predict_parts(explainer_classif_decTree, new_observation = test, type = "break_down")
new_cust_nnet <- predict_parts(explainer_classif_nnet, new_observation = test, type = "break_down")
new_cust_xgbTree <- predict_parts(explainer_classif_xgbTree, new_observation = test, type = "break_down")
#new_cust_logicBag <- predict_parts(explainer_classif_logicBag, new_observation = test, type = "break_down")

# class of prediction_breakdown output
class(new_cust_rf)
## [1] "predict_parts" "break_down"    "data.frame"
# check out the top 10 influential variables for this observation
new_cust_rf[1:10, 1:5]
##                                                           contribution
## : intercept                                                      0.283
## : effect.metric = 6                                             -0.055
## : dose.um3.mL.master = 4.02413367973394                          0.045
## : dose.mg.L.master = -1.95467702121334                          -0.061
## : lvl1_f = 5                                                    -0.018
## : treatments = 0.301029995663981                                -0.005
## : shape = 4                                                      0.023
## : dose.particles.mL.master = 3.40204507040936                    0.017
## : size.length.um.used.for.conversions = 0.301029995663981       -0.014
## : bio.org = 2                                                   -0.087
# Break-down profiles from all six models on one panel.
plot(new_cust_glm, new_cust_rf, new_cust_svm, new_cust_decTree, new_cust_nnet, new_cust_xgbTree)#,

     #new_cust_logicBag)
library(ggplot2)

# The three panels below were identical except for which pair of break-down
# objects they display; one helper removes the triplicated pipeline (and the
# triplicated library(ggplot2) calls).
plot_top_breakdown <- function(breakdowns, n = 11) {
  # Keep the n largest |contribution| rows per model (11 includes the
  # intercept row plus the 10 most influential variables) and dot-plot them,
  # faceted by model label.
  breakdowns %>%
    purrr::map(~ top_n(., n, wt = abs(contribution))) %>%
    do.call(rbind, .) %>%
    mutate(variable = paste0(variable, " (", label, ")")) %>%
    ggplot(aes(contribution, reorder(variable, contribution))) +
    geom_point() +
    geom_vline(xintercept = 0, size = 3, color = "white") +
    facet_wrap(~ label, scales = "free_y", ncol = 1) +
    ylab(NULL)
}

# filter for top influential variables for each model pair and plot
plot_top_breakdown(list(new_cust_glm, new_cust_rf))

plot_top_breakdown(list(new_cust_svm, new_cust_decTree))

plot_top_breakdown(list(new_cust_nnet, new_cust_xgbTree))

3 Final Model

All in all, random forest is my final model of choice: it appears to be the most balanced and is the most accurate overall. This model will now be tuned and refined for maximum performance.

3.1 Tuning

http://rstudio-pubs-static.s3.amazonaws.com/480890_237ad52b09b6440e9c849a3c07a04d2f.html

Plot tuning.

cache = TRUE

# Faceted line plot of every tuning metric against mtry for a caret model
# object (reads model$results). Side effect: theme_set() changes the global
# ggplot theme for the rest of the session, as in the original.
my_plot <- function(model) {
    theme_set(theme_minimal())
    # `a` = metric name, `b` = metric value — names kept from the original so
    # the aes() mappings below are unchanged.
    # FIX: gather() is superseded; pivot_longer() is its tidyr replacement.
    # dplyr::select() is namespaced for consistency with the rest of the file
    # (avoids masking by other attached packages).
    u <- model$results %>%
        dplyr::select(mtry, Accuracy, Kappa, F1, Sensitivity,
                      Specificity, Pos_Pred_Value, Neg_Pred_Value,
                      Precision, Recall, Detection_Rate) %>%
        tidyr::pivot_longer(-mtry, names_to = "a", values_to = "b")

    u %>% ggplot(aes(mtry, b)) + geom_line() + geom_point() +
        facet_wrap(~ a, scales = "free") +
        labs(x = "Number of mtry", y = NULL,
             title = "The Relationship between Model Performance and mtry")
}

rf1 %>% my_plot()

I will further refine this model using recursive feature elimination and compare accuracy with the other models.

# Recursive feature elimination with random-forest ranking functions,
# 10-fold cross-validation repeated 5 times, keeping all resample results.
my_ctrl <- rfeControl(functions = rfFuncs, #random forests
                      method = "repeatedcv",
                      verbose = FALSE,
                      repeats = 5,
                      returnResamp = "all")

# FIX: rfe()'s argument is `sizes` (plural); the original `size =` only
# worked through R's partial argument matching. Note rfe() always evaluates
# the full predictor set in addition to the subset sizes listed here.
rfProfile <- rfe(y = df$effect_10, # set dependent variable
                 x = df %>%
                   dplyr::select(-effect_10),
                 rfeControl = my_ctrl,
                 sizes = c(1:2, 4, 6, 8, 10, 12, 13))
rfProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD KappaSD Selected
##          1   0.8218 0.5080    0.01679 0.05450         
##          2   0.8207 0.5093    0.01771 0.05582         
##          4   0.9041 0.7710    0.01758 0.04347         
##          6   0.9102 0.7860    0.01643 0.04098         
##          8   0.9133 0.7941    0.01638 0.04017         
##         10   0.9127 0.7927    0.01762 0.04338         
##         12   0.9195 0.8089    0.01590 0.03891         
##         13   0.9187 0.8071    0.01613 0.03958         
##         18   0.9196 0.8097    0.01579 0.03830        *
## 
## The top 5 variables (out of 18):
##    effect.metric, lvl2_f, dose.mg.L.master, dose.particles.mL.master, dose.um3.mL.master

3.2 Predictor Selection

The following variables are those that were picked in the final (most accurate) model.

#get variable names picked in the final model
# predictors() returns the names retained by the winning RFE subset, ordered
# by importance.
predictors(rfProfile)
##  [1] "effect.metric"                       "lvl2_f"                             
##  [3] "dose.mg.L.master"                    "dose.particles.mL.master"           
##  [5] "dose.um3.mL.master"                  "organism.group"                     
##  [7] "size.length.um.used.for.conversions" "particle.volume.um3"                
##  [9] "exposure.duration.d"                 "treatments"                         
## [11] "density.mg.um.3"                     "bio.org"                            
## [13] "shape"                               "environment"                        
## [15] "life.stage"                          "lvl1_f"                             
## [17] "polymer"                             "exposure.route"

The first 6 models are shown below with their corresponding accuracy and kappa values.

# Per-fold resample results: accuracy/kappa and confusion-matrix cells for
# each candidate subset size.
head(rfProfile$resample)
##   Variables  Accuracy     Kappa .cell1 .cell2 .cell3 .cell4    Resample
## 1         1 0.8165138 0.4906542    150      0     40     28 Fold01.Rep1
## 2         2 0.8119266 0.4853161    148      2     39     29 Fold01.Rep1
## 3         4 0.9311927 0.8364182    145      5     10     58 Fold01.Rep1
## 4         6 0.9174312 0.8045038    143      7     11     57 Fold01.Rep1
## 5         8 0.9357798 0.8479474    145      5      9     59 Fold01.Rep1
## 6        10 0.9357798 0.8479474    145      5      9     59 Fold01.Rep1
trellis.par.set(caretTheme())
#plot(rfProfile, type = c("g", "o"), metric = "Accuracy")
# ggplot method for rfe objects: resampled accuracy vs subset size.
ggplot(rfProfile) + theme_bw()

# Lattice diagnostics: individual resample accuracies per subset size, and
# accuracy densities restricted to the smallest subsets (< 5 variables).
plot1 <- xyplot(rfProfile, type = c("g", "p", "smooth"), main = "Accuracy of individual resampling results")
plot2 <- densityplot(rfProfile, 
                     subset = Variables < 5, 
                     adjust = 1.25, 
                     as.table = TRUE, 
                     xlab = "Accuracy Estimates", 
                     pch = "|")
# Stack the two lattice plots in one column (top: xyplot, bottom: density).
print(plot1, split=c(1,1,1,2), more=TRUE)
print(plot2, split=c(1,2,1,2))

3.2.1 Final Predictor Selection

# Smallest subset size whose accuracy is within tol% of the best subset.
my_size <- pickSizeTolerance(rfProfile$results, metric = "Accuracy", tol = 1, maximize = TRUE)
# higher tol (~10) gives you less variables
# lower tol (~1) gives you more variables - "I'd like the simplest model within 1% of the best model."
# pickVars() then returns the top `size` variable names from the RFE rankings.
accuracy1 <- pickVars(rfProfile$variables, size = my_size)
accuracy1
## [1] "effect.metric"                       "lvl2_f"                             
## [3] "dose.mg.L.master"                    "dose.particles.mL.master"           
## [5] "dose.um3.mL.master"                  "organism.group"                     
## [7] "size.length.um.used.for.conversions" "particle.volume.um3"

A random forest model with the above eight predictors is within 1% of the best model. If a more accurate model is desired, and data is available, additional factors can be included in the model. Shown below are factors that should be included for a model that is within 0.5% accuracy of the best model.

# Same selection as above, but tightened to within 0.5% of the best model
# (note `my_size` is reused/overwritten across these chunks).
my_size <- pickSizeTolerance(rfProfile$results, metric = "Accuracy", tol = 0.5, maximize = TRUE)
# higher tol (~10) gives you less variables
# lower tol (~1) gives you more variables - "I'd like the simplest model within 0.5% of the best model."
accuracy0.5 <- pickVars(rfProfile$variables, size = my_size)
accuracy0.5
##  [1] "effect.metric"                       "lvl2_f"                             
##  [3] "dose.mg.L.master"                    "dose.particles.mL.master"           
##  [5] "dose.um3.mL.master"                  "organism.group"                     
##  [7] "size.length.um.used.for.conversions" "particle.volume.um3"                
##  [9] "exposure.duration.d"                 "treatments"                         
## [11] "density.mg.um.3"                     "bio.org"

Below is a table showing which variables are included in models of varying accuracies.

# Variable subsets at several accuracy tolerances (0.1%, 0.3%, 5%, 10%);
# `my_size` is overwritten at each step and ends at the tol = 10 value.
my_size <- pickSizeTolerance(rfProfile$results, metric = "Accuracy", tol = 0.1, maximize = TRUE)
accuracy0.1 <- pickVars(rfProfile$variables, size = my_size)
my_size <- pickSizeTolerance(rfProfile$results, metric = "Accuracy", tol = 0.3, maximize = TRUE)
accuracy0.3 <- pickVars(rfProfile$variables, size = my_size)
my_size <- pickSizeTolerance(rfProfile$results, metric = "Accuracy", tol = 5, maximize = TRUE)
accuracy5 <- pickVars(rfProfile$variables, size = my_size)
my_size <- pickSizeTolerance(rfProfile$results, metric = "Accuracy", tol = 10, maximize = TRUE)
accuracy10 <- pickVars(rfProfile$variables, size = my_size)

# NOTE(review): varsTable omits accuracy0.3 and appears unused — the kable()
# call below builds its own list. Confirm whether it can be removed.
varsTable = list('0.1%' =accuracy0.1, '1%' = accuracy1, '5%' = accuracy5, '10%' = accuracy10)

# make padded data.frame helpers
# Pad (or truncate) x to exactly `len` elements; indexing past the end of a
# vector yields NA, which provides the padding.
na.pad <- function(x, len){
    # seq_len() is safe for len == 0, where 1:len would give c(1, 0) and
    # wrongly return the first element.
    x[seq_len(len)]
}

# Bind vectors of unequal length into one data.frame, NA-padding the shorter
# ones up to the longest vector's length. `...` is forwarded to data.frame().
makePaddedDataFrame <- function(l, ...){
    maxlen <- max(sapply(l, length))
    data.frame(lapply(l, na.pad, len = maxlen), ...)
}
#fancy print
# NOTE(review): knitr::kable() has no `footnote` argument — it is silently
# swallowed by `...` and never rendered. To display the footnote, apply
# kableExtra::footnote() to the kable output instead. Also note accuracy0.5
# is computed above but not included in this table.
kable(makePaddedDataFrame(
  list('Accuracy (0.1)' = accuracy0.1, 'Accuracy (0.3)' = accuracy0.3, "Accuracy (1)" = accuracy1, 'Accuracy (5)' = accuracy5, 'Accuracy (10)' = accuracy10)
  ),
      caption = "Variables to include in Random Forest to achieve Model Accuracy (within x% of best model)",
    footnote = "Random Forest Recursive Feature Elimination")
Variables to include in Random Forest to achieve Model Accuracy (within x% of best model)
Accuracy..0.1. Accuracy..0.3. Accuracy..1. Accuracy..5. Accuracy..10.
effect.metric effect.metric effect.metric effect.metric effect.metric
lvl2_f lvl2_f lvl2_f lvl2_f lvl2_f
dose.mg.L.master dose.mg.L.master dose.mg.L.master dose.mg.L.master dose.mg.L.master
dose.particles.mL.master dose.particles.mL.master dose.particles.mL.master dose.particles.mL.master dose.particles.mL.master
dose.um3.mL.master dose.um3.mL.master dose.um3.mL.master NA NA
organism.group organism.group organism.group NA NA
size.length.um.used.for.conversions size.length.um.used.for.conversions size.length.um.used.for.conversions NA NA
particle.volume.um3 particle.volume.um3 particle.volume.um3 NA NA
exposure.duration.d exposure.duration.d NA NA NA
treatments treatments NA NA NA
density.mg.um.3 density.mg.um.3 NA NA NA
bio.org bio.org NA NA NA

3.2.2 Final Model Tuning

While the absolute best model contains 18 variables with an estimated accuracy of 93.89%, it is possible to achieve ~93% accuracy with just eight variables, as displayed in the above table. The advantage of including fewer variables is that less information is needed to accurately predict toxicity. The final model with these eight variables will be further refined through an iterative tuning process to determine the optimal number of variables to be randomly collected for sampling at each split (mtry), and the number of branches to grow after each split. Code for the following section is inspired from https://rpubs.com/phamdinhkhanh/389752.

# Reduced dataset: the response plus eight retained predictors.
# NOTE(review): the narrative above targets the eight-variable RFE subset,
# but this selection keeps exposure.duration.d and treatments in place of
# size.length.um.used.for.conversions and particle.volume.um3 from the
# accuracy1 list — confirm which eight predictors were intended.
final_df <-df %>% 
  dplyr::select(c(effect.metric, dose.mg.L.master, lvl2_f, dose.particles.mL.master, effect_10, dose.um3.mL.master, organism.group, exposure.duration.d, treatments)) %>% 
  droplevels()
 
#ensure completeness
skim(final_df)
Data summary
Name final_df
Number of rows 2188
Number of columns 9
_______________________
Column type frequency:
factor 4
numeric 5
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
effect.metric 0 1 FALSE 3 not: 1576, NOE: 318, LOE: 294
lvl2_f 0 1 FALSE 25 Mor: 412, Rep: 368, Oxi: 288, Dev: 287
effect_10 0 1 FALSE 2 0: 1504, 1: 684
organism.group 0 1 FALSE 10 Cru: 795, Fis: 528, Mol: 433, Alg: 222

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
dose.mg.L.master 0 1 -0.23 2.20 -11.64 -1.70 -0.30 1.40 8.17 ▁▁▇▆▁
dose.particles.mL.master 0 1 4.24 4.05 -4.20 1.05 3.40 7.20 21.36 ▃▇▃▁▁
dose.um3.mL.master 0 1 5.84 2.34 -5.67 4.42 6.02 7.37 14.32 ▁▁▇▆▁
exposure.duration.d 0 1 0.93 0.73 -1.40 0.48 1.00 1.32 1.98 ▁▁▆▆▇
treatments 0 1 0.58 0.15 0.00 0.48 0.60 0.60 1.00 ▁▁▆▇▁
# Create train/test splits with the rsample/tidymodels initial_split()
# function (calibration = training, validation = testing).
set.seed(4)  # fixed seed so the split is reproducible
df_final_split <- final_df %>%
  initial_split(prop = 0.75)
# prop = 0.75 states the default 3/4 split explicitly:
# 75% training, 25% testing.

# training()/testing() pull the corresponding rows out of the
# split object created by initial_split()
train_final <- training(df_final_split)
test_final <- testing(df_final_split)

# variable names for response & features
y <- "effect_10"
x <- setdiff(names(final_df), y) 

The mtry parameter will be optimized below:

# Candidate values for mtry (number of predictors sampled at each split).
# NOTE: caret's "rf" method only tunes mtry; ntree is fixed per train()
# call, so it is optimized separately in the next chunk.
tunegrid <- expand.grid(.mtry = seq_len(7))

# Resampling scheme: 10-fold cross-validation, repeated 3 times
fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 3)

# Number of candidate models to evaluate
nrow(tunegrid)
## [1] 7
# Fit one random forest per candidate mtry value, scoring each by
# accuracy under the repeated-CV scheme defined above.
set.seed(825)  # reproducible resampling
tuneFit <- train(
  effect_10 ~ .,
  data = train_final,
  method = "rf",
  metric = "Accuracy",
  tuneGrid = tunegrid,
  trControl = fitControl,
  verbose = FALSE
)
tuneFit  # print CV accuracy/kappa for each mtry
## Random Forest 
## 
## 1641 samples
##    8 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 1477, 1478, 1477, 1476, 1477, 1476, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   1     0.6867798  0.0000000
##   2     0.7576789  0.3134210
##   3     0.8135455  0.5174236
##   4     0.8696117  0.6813289
##   5     0.8864447  0.7264346
##   6     0.8957809  0.7500463
##   7     0.9000418  0.7610493
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.
plot(tuneFit)

Now that mtry is optimized, the number of trees will be optimized, holding mtry constant.

# Manual grid search over ntree, holding mtry at its optimum.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                        search = "grid")
tunegrid <- expand.grid(.mtry = c(7))  # optimal value from previous chunk

ntree_candidates <- c(1000, 1500, 2000, 2500)
modellist <- list()
for (ntree in ntree_candidates) {
  # Same seed each iteration so every ntree setting sees identical CV folds
  set.seed(123)
  fit <- train(effect_10 ~ ., data = train_final, method = "rf",
               metric = "Accuracy", tuneGrid = tunegrid,
               trControl = control, ntree = ntree)
  modellist[[as.character(ntree)]] <- fit
}

# Compare resampling distributions across the ntree settings
results <- resamples(modellist)
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: 1000, 1500, 2000, 2500 
## Number of resamples: 30 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 1000 0.8780488 0.8902439 0.9024390 0.9029182 0.9151515 0.9329268    0
## 1500 0.8780488 0.8902439 0.9024390 0.9031177 0.9148912 0.9272727    0
## 2000 0.8780488 0.8916093 0.9027347 0.9039283 0.9148912 0.9329268    0
## 2500 0.8780488 0.8916093 0.9027347 0.9039283 0.9148912 0.9329268    0
## 
## Kappa 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 1000 0.7059351 0.7354245 0.7625810 0.7678826 0.8003116 0.8355215    0
## 1500 0.7059351 0.7354245 0.7648221 0.7682879 0.7968108 0.8315180    0
## 2000 0.7059351 0.7389113 0.7678385 0.7702313 0.7968108 0.8355215    0
## 2500 0.7059351 0.7389113 0.7678385 0.7702313 0.7968108 0.8355215    0
dotplot(results)

3.2.3 Final Model

Now that we have optimized the variables for our final model, we will build and save the final simplified model that has the highest accuracy.

# Final simplified model: fix the hyperparameters found above
# (mtry = 7, ntree = 2500) rather than re-tuning.
tunegrid <- expand.grid(.mtry = c(7))

# BUG FIX: the original call used tuneLength = 5, which re-tunes mtry over
# five candidate values and silently ignores `tunegrid`; pass
# tuneGrid = tunegrid so the optimized mtry = 7 is actually used.
final_model <- train(effect_10 ~ ., data = train_final, method = "rf",
                     ntree = 2500, tuneGrid = tunegrid)

# DALEX expects a numeric 0/1 response for classification explainers
yTest <- as.numeric(as.character(test_final$effect_10))

# Wrap the caret model for model-agnostic diagnostics on the test set.
# NOTE(review): `data` still contains the response column effect_10;
# DALEX tolerates this, but consider dropping it from `data` — verify.
explainer_final_model <- DALEX::explain(final_model, label = "rf",
                                        data = test_final,
                                        y = yTest)
## Preparation of a new explainer is initiated
##   -> model label       :  rf 
##   -> data              :  547  rows  9  cols 
##   -> data              :  tibble converted into a data.frame 
##   -> target variable   :  547  values 
##   -> predict function  :  yhat.train  will be used (  default  )
##   -> predicted values  :  numerical, min =  0 , mean =  0.2799678 , max =  0.9976  
##   -> model_info        :  package caret , ver. 6.0.86 , task classification (  default  ) 
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -0.99 , mean =  0.03081828 , max =  0.9896  
##   A new explainer has been created! 
# Classification performance (recall, precision, F1, accuracy, AUC) and
# residual distribution on the test set, via the DALEX explainer
mp_classif_final_model <- model_performance(explainer_final_model)
mp_classif_final_model
## Measures for:  classification
## recall     : 0.7411765 
## precision  : 0.919708 
## f1         : 0.8208469 
## accuracy   : 0.8994516 
## auc        : 0.9550476
## 
## Residuals:
##       0%      10%      20%      30%      40%      50%      60%      70% 
## -0.99000 -0.14400 -0.07208 -0.03008 -0.01200 -0.00520 -0.00160  0.00360 
##      80%      90%     100% 
##  0.06256  0.34920  0.98960

3.2.3.1 Performance

Performance metrics for the final model are below.

3.2.3.1.1 ROC Curve
# ROC curve for the final model, annotated with AUC and accuracy.
# The paste() calls are kept identical to preserve the subtitle text.
auc_rounded <- round(mp_classif_final_model$measures$auc, 2)
acc_percent <- 100 * round(mp_classif_final_model$measures$accuracy, 3)
roc_subtitle <- paste("AUC = ", auc_rounded,
                      paste("Accuracy = ", acc_percent, "%"))

plot(mp_classif_final_model, geom = "roc") +
  ggtitle("ROC Curve - Final Model", roc_subtitle) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        legend.position = "none")

3.2.3.1.2 Confusion Matrix

The tuned, final, simplified model is now validated using test set data.

# Score the held-out test set, then summarize with a confusion matrix
test_pred <- predict(final_model, newdata = test_final)
pred_vs_actual <- table(test_pred, test_final$effect_10)
confusionMatrix(pred_vs_actual)
## Confusion Matrix and Statistics
## 
##          
## test_pred   0   1
##         0 366  44
##         1  11 126
##                                           
##                Accuracy : 0.8995          
##                  95% CI : (0.8711, 0.9234)
##     No Information Rate : 0.6892          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7521          
##                                           
##  Mcnemar's Test P-Value : 1.597e-05       
##                                           
##             Sensitivity : 0.9708          
##             Specificity : 0.7412          
##          Pos Pred Value : 0.8927          
##          Neg Pred Value : 0.9197          
##              Prevalence : 0.6892          
##          Detection Rate : 0.6691          
##    Detection Prevalence : 0.7495          
##       Balanced Accuracy : 0.8560          
##                                           
##        'Positive' Class : 0               
## 

This confusion matrix is plotted below.

# Confusion-matrix heat map: cell counts, shaded by the proportion of each
# reference class (column-normalized, mirroring sensitivity/specificity).
# FIXES: the original assigned to `table`, masking base::table(); it also
# used table$Prediction/table$Reference inside mutate() instead of bare
# column names, and left the data grouped. Use a descriptive name, refer
# to columns directly, and ungroup() when done.
cm_df <- data.frame(confusionMatrix(test_pred, test_final$effect_10)$table)

plotTable <- cm_df %>%
  mutate(goodbad = ifelse(Prediction == Reference, "good", "bad")) %>%
  group_by(Reference) %>%
  mutate(prop = Freq / sum(Freq)) %>%
  ungroup()

# fill relative to proportional outcomes within reference groups
# (compare with the original confusion matrix above)
ggplot(data = plotTable, mapping = aes(x = Reference, y = Prediction, fill = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1) +
  scale_fill_gradient(low = "white", high = "cyan4", name = "Proportion") +
  scale_x_discrete(labels = c("No Effect", "Effect")) +
  scale_y_discrete(labels = c("No Effect", "Effect")) +
  theme_bw()

4 Machine Settings

Time to knit:

all_times
## $`unnamed-chunk-1`
## Time difference of 7.34167 secs
## 
## $`unnamed-chunk-2`
## Time difference of 1.478028 secs
## 
## $`unnamed-chunk-3`
## Time difference of 0.3949451 secs
## 
## $`unnamed-chunk-4`
## Time difference of 0.02393603 secs
## 
## $`All Model Training`
## Time difference of 11.44312 mins
## 
## $`model performance`
## Time difference of 0.0189178 secs
## 
## $`unnamed-chunk-5`
## Time difference of 0.6174459 secs
## 
## $`unnamed-chunk-6`
## Time difference of 0.879647 secs
## 
## $`unnamed-chunk-7`
## Time difference of 0.71808 secs
## 
## $`unnamed-chunk-8`
## Time difference of 0.636302 secs
## 
## $`unnamed-chunk-9`
## Time difference of 1.198795 secs
## 
## $`unnamed-chunk-10`
## Time difference of 0.955446 secs
## 
## $`unnamed-chunk-11`
## Time difference of 1.619668 secs
## 
## $`unnamed-chunk-12`
## Time difference of 0 secs
## 
## $`unnamed-chunk-13`
## Time difference of 5.024565 secs
## 
## $`unnamed-chunk-14`
## Time difference of 0.8028538 secs
## 
## $`unnamed-chunk-15`
## Time difference of 2.41654 secs
## 
## $`unnamed-chunk-16`
## Time difference of 2.858357 secs
## 
## $`unnamed-chunk-17`
## Time difference of 0.595407 secs
## 
## $`unnamed-chunk-18`
## Time difference of 1.390181 mins
## 
## $`unnamed-chunk-19`
## Time difference of 1.522929 secs
## 
## $`unnamed-chunk-20`
## Time difference of 6.398894 secs
## 
## $`unnamed-chunk-21`
## Time difference of 2.152245 secs
## 
## $`unnamed-chunk-22`
## Time difference of 6.273228 secs
## 
## $`unnamed-chunk-23`
## Time difference of 1.048199 secs
## 
## $`unnamed-chunk-24`
## Time difference of 1.117015 secs
## 
## $`unnamed-chunk-25`
## Time difference of 0.9175479 secs
## 
## $`unnamed-chunk-26`
## Time difference of 0.7290499 secs
## 
## $`unnamed-chunk-27`
## Time difference of 1.683497 secs
## 
## $`unnamed-chunk-28`
## Time difference of 19.08298 secs
## 
## $`unnamed-chunk-29`
## Time difference of 0.5614989 secs
## 
## $`unnamed-chunk-30`
## Time difference of 0.6293161 secs
## 
## $`unnamed-chunk-31`
## Time difference of 0.5734668 secs
## 
## $`unnamed-chunk-32`
## Time difference of 0.4956751 secs
## 
## $Tuning
## Time difference of 37.17174 mins
## 
## $`unnamed-chunk-33`
## Time difference of 0.919852 secs
## 
## $`Recursive Feature Elimination`
## Time difference of 8.896982 mins
## 
## $`unnamed-chunk-34`
## Time difference of 0 secs
## 
## $`unnamed-chunk-35`
## Time difference of 0.01096988 secs
## 
## $`unnamed-chunk-36`
## Time difference of 0.2988698 secs
## 
## $`unnamed-chunk-37`
## Time difference of 0.5814481 secs
## 
## $`unnamed-chunk-38`
## Time difference of 0.02293897 secs
## 
## $`unnamed-chunk-39`
## Time difference of 0.05186296 secs
## 
## $`unnamed-chunk-40`
## Time difference of 0.05784392 secs
## 
## $`unnamed-chunk-41`
## Time difference of 0.1785231 secs
## 
## $`unnamed-chunk-42`
## Time difference of 0.02094388 secs
## 
## $`unnamed-chunk-43`
## Time difference of 4.532175 mins
## 
## $`unnamed-chunk-44`
## Time difference of 0.267421 secs
## 
## $`unnamed-chunk-45`
## Time difference of 11.49029 mins
## 
## $`unnamed-chunk-46`
## Time difference of 0.222404 secs
## 
## $`Final Model`
## Time difference of 17.93435 mins
## 
## $`unnamed-chunk-47`
## Time difference of 0.2544019 secs
## 
## $`unnamed-chunk-48`
## Time difference of 0.1605709 secs
## 
## $`unnamed-chunk-49`
## Time difference of 0.697135 secs

Machine Info:

Sys.info()[c(1:3,5)]
##       sysname       release       version       machine 
##     "Windows"      "10 x64" "build 18363"      "x86-64"

Session Info:

sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] parallel  grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3     modelplotr_1.1.0  doParallel_1.0.16 iterators_1.0.13 
##  [5] foreach_1.5.1     knitr_1.31        ggeffects_1.0.1   skimr_2.1.2      
##  [9] caret_6.0-86      tigerstats_0.3.2  abd_0.2-8         mosaic_1.8.3     
## [13] ggridges_0.5.3    mosaicData_0.20.2 ggformula_0.10.1  ggstance_0.3.5   
## [17] Matrix_1.3-2      lattice_0.20-41   nlme_3.1-152      forcats_0.5.1    
## [21] stringr_1.4.0     purrr_0.3.4       readr_1.4.0       tidyr_1.1.2      
## [25] tibble_3.0.6      ggplot2_3.3.3     tidyverse_1.3.0   DALEX_2.0.1      
## [29] dplyr_1.0.4       rsample_0.0.9    
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1         backports_1.2.1      plyr_1.8.6          
##   [4] repr_1.1.3           splines_4.0.4        crosstalk_1.1.1     
##   [7] listenv_0.8.0        leaflet_2.0.4.1      digest_0.6.27       
##  [10] htmltools_0.5.1.1    magrittr_2.0.1       mosaicCore_0.9.0    
##  [13] ggfittext_0.9.1      recipes_0.1.15       globals_0.14.0      
##  [16] modelr_0.1.8         gower_0.2.2          colorspace_2.0-0    
##  [19] rvest_0.3.6          ggrepel_0.9.1        haven_2.3.1         
##  [22] xfun_0.21            crayon_1.4.1         jsonlite_1.7.2      
##  [25] libcoin_1.0-8        survival_3.2-7       glue_1.4.2          
##  [28] polyclip_1.10-0      gtable_0.3.0         ipred_0.9-9         
##  [31] kernlab_0.9-29       scales_1.1.1         mvtnorm_1.1-1       
##  [34] DBI_1.1.1            Rcpp_1.0.6           Cubist_0.2.3        
##  [37] Formula_1.2-4        stats4_4.0.4         lava_1.6.8.1        
##  [40] prodlim_2019.11.13   htmlwidgets_1.5.3    httr_1.4.2          
##  [43] RColorBrewer_1.1-2   ellipsis_0.3.1       pkgconfig_2.0.3     
##  [46] manipulate_1.0.1     farver_2.0.3         nnet_7.3-15         
##  [49] sass_0.3.1           dbplyr_2.1.0         tidyselect_1.1.0    
##  [52] labeling_0.4.2       rlang_0.4.10         reshape2_1.4.4      
##  [55] munsell_0.5.0        cellranger_1.1.0     tools_4.0.4         
##  [58] xgboost_1.3.2.1      cli_2.3.0            generics_0.1.0      
##  [61] sjlabelled_1.1.7     broom_0.7.4          evaluate_0.14       
##  [64] ggdendro_0.1.22      yaml_2.2.1           ModelMetrics_1.2.2.2
##  [67] fs_1.5.0             randomForest_4.6-14  future_1.21.0       
##  [70] xml2_1.3.2           compiler_4.0.4       rstudioapi_0.13     
##  [73] e1071_1.7-4          reprex_1.0.0         tweenr_1.0.1        
##  [76] bslib_0.2.4          stringi_1.5.3        iBreakDown_1.3.1    
##  [79] highr_0.8            vctrs_0.3.6          pillar_1.4.7        
##  [82] lifecycle_1.0.0      furrr_0.2.2          jquerylib_0.1.3     
##  [85] data.table_1.13.6    insight_0.13.0       R6_2.5.0            
##  [88] C50_0.1.3.1          parallelly_1.23.0    codetools_0.2-18    
##  [91] MASS_7.3-53          assertthat_0.2.1     withr_2.4.1         
##  [94] hms_1.0.0            rpart_4.1-15         labelled_2.7.0      
##  [97] timeDate_3043.102    class_7.3-18         rmarkdown_2.7       
## [100] inum_1.0-3           ingredients_2.0.1    partykit_1.2-12     
## [103] ggforce_0.3.2        pROC_1.17.0.1        lubridate_1.7.9.2   
## [106] base64enc_0.1-3
 

A work by Scott Coffin

Scott.Coffin@waterboards.ca.gov